################################################################################
### Politicians' Complaint Response: E-Governance and Personal Relationships ###
### Question Text Analysis                                                   ###
### William O'Brochta                                                        ###
### Louisiana Tech University                                                ###
################################################################################
#This file replicates only Figures 4.1-4.3 and Tables 4.4-4.5.
#See MC Survey Analysis.R to replicate all other figures and tables.


library(tidyverse)        
library(quanteda)         
library(stm)              
library(stminsights)      
library(wordcloud)        
library(gsl)             
library(topicmodels)      
library(caret)            
library(Rtsne)
library(geometry)
library(rsvd)
library(xtable)

load("delhiquestions3.RData")

#Remove questions where no details are available
delhiquestions3<-delhiquestions3[delhiquestions3$Question!="Questions Detailed are not available",]
delhiquestions4<-delhiquestions3[!is.na(delhiquestions3$ComplaintCoding),]

#Keep duplicate questions
corpus1<-corpus(delhiquestions4, text_field="Question")
docvars(corpus1, "Question_No") <-
  sprintf("%02d", 1:ndoc(corpus1)) 

corpus1.stats <- summary(corpus1)

token <-
  tokens(
    corpus1,
    remove_numbers = TRUE,
    remove_punct = TRUE,
    remove_symbols = TRUE,
    what = "word",
    remove_url = TRUE,
    split_hyphens = TRUE,
    include_docvars = TRUE
  )


token_ungd <- tokens_select(
  token,
  c("[\\d-]", "[[:punct:]]", "^.{1,2}$",
    "ward", "also", "can", "said", "ndmc",
    "edmc", "sdmc", "committee", "corporator",
    "municipal", "councillor", "office", "delhi",
    "state", "issue", "meeting", "meet", "standing",
    "etc"),
  selection = "remove",
  valuetype = "regex",
  verbose = TRUE
)

mydfm <- dfm(token_ungd,
             tolower = TRUE,
             stem = TRUE,
             remove = stopwords("english")
)


mydfm.trim <-
  dfm_trim(
    mydfm,
    min_docfreq = 0.01,
    # min 1.0%
    max_docfreq = 0.90,
    #  max 90%
    docfreq_type = "prop"
  )

head(dfm_sort(mydfm.trim, decreasing = TRUE, margin = "both"),
     n = 10) 

dfm2stm <- convert(mydfm.trim, to = "stm")


#STM 
model.stm <- stm(
  dfm2stm$documents,
  dfm2stm$vocab, max.em.its = 500, emtol=0.0005,
  K = 35,
  data = dfm2stm$meta, seed=123,
  prevalence= ~Committee_Name+Corporation+ComplaintCoding,
  init.type = "Spectral"
)

#save(model.stm, file="modelstm.RData")
#load("modelstm.RData")

labelTopics(model.stm)

stmtopics<-as.data.frame(labelTopics(model.stm)$prob)
probability <- colMeans(model.stm$theta)
topicnum<-1:35
stmtopics<-as.data.frame(cbind(topicnum, stmtopics, probability))
stmtopics<-stmtopics[order(-stmtopics$probability),]

#Table SI.4.4
print(xtable(stmtopics, type='latex', row.names=F), file='stmtopics.tex', include.rownames=FALSE)

#Figure SI.4.1, Panel A
plot.STM(model.stm,type="summary", main="STM Topics", xlim=c(0,0.20),cex.axis=1.5, cex.main=1.5, cex.lab=1.5)

exclusivity <- function(mod.out, M=10, frexw=.7){
  w <- frexw
  if(length(mod.out$beta$logbeta)!=1) stop("Exclusivity calculation only designed for models without content covariates")
  tbeta <- t(exp(mod.out$beta$logbeta[[1]]))
  s <- rowSums(tbeta)
  mat <- tbeta/s #normed by columns of beta now.
  
  ex <- apply(mat,2,rank)/nrow(mat)
  fr <- apply(tbeta,2,rank)/nrow(mat)
  frex<- 1/(w/ex + (1-w)/fr)
  index <- apply(tbeta, 2, order, decreasing=TRUE)[1:M,]
  out <- vector(length=ncol(tbeta)) 
  for(i in 1:ncol(frex)) {
    out[i] <- sum(frex[index[,i],i])
  }
  out
}

topicQuality(model.stm,documents=dfm2stm$documents)

#Want highest held-out likelihood; lowest residuals; 
#high semantic coherence, but also high exclusivity
sk<-searchK(dfm2stm$documents,dfm2stm$vocab,K=c(10,20,30,40,50), emtol=0.0005,
            data = dfm2stm$meta, seed=123,
            prevalence= ~Committee_Name+Corporation+ComplaintCoding,
            init.type = "Spectral")
#save(sk, file="sk.RData")
#load("sk.RData")

#Figure SI.4.2
knitr::kable(sk$results)
plot(sk)

#Investigate between 32 and 42 topics
sk2<-searchK(dfm2stm$documents,dfm2stm$vocab,K=c(32,34,35,36,38,40,42), emtol=0.0005,
            data = dfm2stm$meta, seed=123,
            prevalence= ~Committee_Name+Corporation+ComplaintCoding,
            init.type = "Spectral")
#save(sk2, file="sk2.RData")
#load("sk2.RData")

knitr::kable(sk2$results)
plot(sk2)





###BTM
library(udpipe)
library(data.table)
library(stopwords)
library(BTM)
library(textplot)
library(ggraph)
library(LDAvis)

x<-delhiquestions4
x$doc_id <- row.names(x)
x$text   <- tolower(x$Question)
x$text   <- gsub("'", "", x$text)
x$text   <- gsub("<.+>", "", x$text)

anno    <- udpipe(x, "english", trace = 10)
#save(anno, file="anno.RData")

biterms <- as.data.table(anno)
biterms <- biterms[, cooccurrence(x = lemma,
                                  relevant = upos %in% c("NOUN", "ADJ", "VERB") & 
                                    nchar(lemma) > 2 & !lemma %in% stopwords("en"),
                                  skipgram = 3),
                   by = list(doc_id)]
#save(biterms, file="biterms.RData")

set.seed(123456)
traindata <- subset(anno, upos %in% c("NOUN", "ADJ", "VERB") & !lemma %in% stopwords("en") & nchar(lemma) > 2)
traindata <- traindata[, c("doc_id", "lemma")]
model9<- BTM(traindata, biterms = biterms, k = 9, iter = 2000, background = TRUE, trace = 100)
#save(model9, file="model9.RData")

#Topic Probabilities
model9$theta

#Terms associated with each topic
topicterms <- terms(model9, top_n = 10)
topicterms

library(concaveman)
plot(model9, top_n = 10,
     title = "BTM model with 9 topics")


#35 Topics
model35<- BTM(traindata, biterms = biterms, k = 35, iter = 2000, background = TRUE, trace = 100)
#save(model35, file="model35.RData")

#Topic Probabilities
model35$theta

#Terms associated with each topic
topicterms35 <- terms(model35, top_n = 10)
topicterms35

topicterms35df<-as.data.frame(topicterms35)
topicterms35df<-topicterms35df[, c(TRUE,FALSE)]
topicterms35df<-as.data.frame(t(topicterms35df))
topicterms35df$probability<-model35$theta
topicnum<-1:35
topicterms35df<-as.data.frame(cbind(topicnum,topicterms35df))
topictermslabels<-topicterms35df

topicterms35df<-topicterms35df[order(-topicterms35df$probability),]

#Table SI.4.5
print(xtable(topicterms35df, type='latex', row.names=F), file='topicterms35df.tex', include.rownames=FALSE)


#Figure SI.4.1, Panel B
topictermslabels$topicname<-paste0("Topic ",1:35, ": ", 
                                   topictermslabels$`1`, ", ",
                                   topictermslabels$`2`, ", ",
                                   topictermslabels$`3`)

frequency <- model35$theta
invrank <- order(frequency, decreasing=FALSE)
lab<-topictermslabels$topicname

plot(c(0,0), type="n", xlim=c(0,0.2), ylim=c(0,35), main="BTM Topics", 
     yaxt="n", 
     ylab="", xlab="Expected Topic Proportions", lwd=2, cex.axis=1.5, cex.lab=1.5, cex.main=1.5)
for(i in 1:length(invrank)) {
  lines(c(0,frequency[invrank[i]]), c(i, i), lwd=2)
  text(frequency[invrank[i]]+ min(2*max(frequency), 1)/100, i , lab[invrank[i]],pos=4)}


###Pick model like Santos 2018 based on highest ll
#15 Topics
model15<- BTM(traindata, biterms = biterms, k = 15, iter = 2000, background = TRUE, trace = 100)
#save(model15, file="model15.RData")

#20 Topics
model20<- BTM(traindata, biterms = biterms, k = 20, iter = 2000, background = TRUE, trace = 100)
#save(model20, file="model20.RData")

#25 Topics
model25<- BTM(traindata, biterms = biterms, k = 25, iter = 2000, background = TRUE, trace = 100)
#save(model25, file="model25.RData")

#30 Topics
model30<- BTM(traindata, biterms = biterms, k = 30, iter = 2000, background = TRUE, trace = 100)
#save(model30, file="model30.RData")



#Compare fit (higher --- less negative --- LL is better)
fit9<-logLik(model9)
fit35<-logLik(model35)
fit15<-logLik(model15)
fit20<-logLik(model20)
fit25<-logLik(model25)
fit30<-logLik(model30)

#Figure SI.4.3
btmll<-c(fit9$ll, fit15$ll, fit20$ll, fit25$ll, fit30$ll, fit35$ll)
btmmodels<-c(9,15,20,25,30,35)
par(mar=c(5.1,5.1,4.1,2.1))
plot(btmll~btmmodels, main="Log Likelihood", xlab="Number of Topics",
     ylab="Log Likelihood", cex.main=1.5, cex.axis=1.5, cex.lab=1.5, type="o", lwd=2)




